R Markdown Setup Chunk

require(knitr)
## Loading required package: knitr
knitr::opts_chunk$set(echo = TRUE,warning = FALSE, message = FALSE)
knitr::opts_chunk$set(fig.width = 15, fig.height = 10)
knitr::opts_chunk$set(comment = NA)
# Change this working directory to be your location of the session 7 folder
knitr::opts_knit$set(root.dir="~/Desktop/code_club_curriculum/Session_7/") 

Agenda

  1. Set up the R Environment
  2. Load the .treefile Into the R Environment
  3. Perform Initial Visualization of the Phylogeny Using the ggtree Package
  4. Clean the Phylogeny by Removing Tips and Midpoint Rooting
  5. Explore Whether Subseting Tree Based on Sequence Type of Interest is Valuable
  6. Subset Tree Based on Sequence Type
  7. Visualize Colistin Resistance Using Gheatmap
  8. Using ggnewscale to plot heatmap with variables that have different data structures

Part 0: Set up the R Environment

Note: This code chunk loads the required packages for this code club session. Instead of using the library function consecutively for each package, I often use this lapply function strategy to load my packages at the start of my R markdown file.

# Packages used 
  # Cran packages
  cran_packages <- c("BiocManager","phytools","ggplot2","tidyverse","readxl","randomcoloR","cowplot","printr")
  # BiocManager packages
  BiocManager_packages <- c("ggtree","ggnewscale")
  
# Functions to install packages (if not installed already)
  # lapply(cran_packages,install.packages,character.only=T)
  # lapply(BiocManager_packages,BiocManager::install,character.only=T)

# Functions to load packages
lapply(cran_packages,library,character.only=T)
[[1]]
[1] "BiocManager" "knitr"       "stats"       "graphics"    "grDevices"  
[6] "utils"       "datasets"    "methods"     "base"       

[[2]]
 [1] "phytools"    "maps"        "ape"         "BiocManager" "knitr"      
 [6] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[11] "methods"     "base"       

[[3]]
 [1] "ggplot2"     "phytools"    "maps"        "ape"         "BiocManager"
 [6] "knitr"       "stats"       "graphics"    "grDevices"   "utils"      
[11] "datasets"    "methods"     "base"       

[[4]]
 [1] "forcats"     "stringr"     "dplyr"       "purrr"       "readr"      
 [6] "tidyr"       "tibble"      "tidyverse"   "ggplot2"     "phytools"   
[11] "maps"        "ape"         "BiocManager" "knitr"       "stats"      
[16] "graphics"    "grDevices"   "utils"       "datasets"    "methods"    
[21] "base"       

[[5]]
 [1] "readxl"      "forcats"     "stringr"     "dplyr"       "purrr"      
 [6] "readr"       "tidyr"       "tibble"      "tidyverse"   "ggplot2"    
[11] "phytools"    "maps"        "ape"         "BiocManager" "knitr"      
[16] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[21] "methods"     "base"       

[[6]]
 [1] "randomcoloR" "readxl"      "forcats"     "stringr"     "dplyr"      
 [6] "purrr"       "readr"       "tidyr"       "tibble"      "tidyverse"  
[11] "ggplot2"     "phytools"    "maps"        "ape"         "BiocManager"
[16] "knitr"       "stats"       "graphics"    "grDevices"   "utils"      
[21] "datasets"    "methods"     "base"       

[[7]]
 [1] "cowplot"     "randomcoloR" "readxl"      "forcats"     "stringr"    
 [6] "dplyr"       "purrr"       "readr"       "tidyr"       "tibble"     
[11] "tidyverse"   "ggplot2"     "phytools"    "maps"        "ape"        
[16] "BiocManager" "knitr"       "stats"       "graphics"    "grDevices"  
[21] "utils"       "datasets"    "methods"     "base"       

[[8]]
 [1] "printr"      "cowplot"     "randomcoloR" "readxl"      "forcats"    
 [6] "stringr"     "dplyr"       "purrr"       "readr"       "tidyr"      
[11] "tibble"      "tidyverse"   "ggplot2"     "phytools"    "maps"       
[16] "ape"         "BiocManager" "knitr"       "stats"       "graphics"   
[21] "grDevices"   "utils"       "datasets"    "methods"     "base"       
lapply(BiocManager_packages,library,character.only=T)
[[1]]
 [1] "ggtree"      "printr"      "cowplot"     "randomcoloR" "readxl"     
 [6] "forcats"     "stringr"     "dplyr"       "purrr"       "readr"      
[11] "tidyr"       "tibble"      "tidyverse"   "ggplot2"     "phytools"   
[16] "maps"        "ape"         "BiocManager" "knitr"       "stats"      
[21] "graphics"    "grDevices"   "utils"       "datasets"    "methods"    
[26] "base"       

[[2]]
 [1] "ggnewscale"  "ggtree"      "printr"      "cowplot"     "randomcoloR"
 [6] "readxl"      "forcats"     "stringr"     "dplyr"       "purrr"      
[11] "readr"       "tidyr"       "tibble"      "tidyverse"   "ggplot2"    
[16] "phytools"    "maps"        "ape"         "BiocManager" "knitr"      
[21] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
[26] "methods"     "base"       

Part 1: Load the .treefile Into the R Environment

# Note: If you choose to work in the console and not knit your .Rmd, set your working directory to YOUR location of the session 7 folder
setwd("~/Desktop/code_club_curriculum/Session_7/")
# Load .treefile using the ape package ()
tree <- read.tree(file="./2021_02_12_08_34_28_KPNIH1_genome_aln_w_alt_allele_unmapped.treefile")

# Question: What type of object is the .treefile?
# Answer: A list of the class "phylo." 
class(tree)
[1] "phylo"

Part 2: Perform Initial Visualization of the Tree Using ggtree

See the ggtree full vignette at “https://yulab-smu.top/treedata-book/

# Visualize tree object using the ggtree function
  # Help page documentation for ggtree function
  help(ggtree)
ggtree R Documentation

visualizing phylogenetic tree and heterogenous associated data based on grammar of graphics ggtree provides functions for visualizing phylogenetic tree and its associated data in R.

Description

If you use ggtree in published research, please cite: Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628

drawing phylogenetic tree from phylo object

Usage

ggtree(
  tr,
  mapping = NULL,
  layout = "rectangular",
  open.angle = 0,
  mrsd = NULL,
  as.Date = FALSE,
  yscale = "none",
  yscale_mapping = NULL,
  ladderize = TRUE,
  right = FALSE,
  branch.length = "branch.length",
  root.position = 0,
  xlim = NULL,
  ...
)

Arguments

tr

phylo object

mapping

aesthetic mapping

layout

one of ‘rectangular’, ‘dendrogram’, ‘slanted’, ‘ellipse’, ‘roundrect’, ‘fan’, ‘circular’, ‘inward_circular’, ‘radial’, ‘equal_angle’, ‘daylight’ or ‘ape’

open.angle

open angle, only for ‘fan’ layout

mrsd

most recent sampling date

as.Date

logical whether using Date class in time tree

yscale

y scale

yscale_mapping

yscale mapping for category variable

ladderize

logical (default TRUE). Should the tree be re-organized to have a ‘ladder’ aspect?

right

logical. If ladderize = TRUE, should the ladder have the smallest clade on the right-hand side? See ape::ladderize() for more information.

branch.length

variable for scaling branch, if ‘none’ draw cladogram

root.position

position of the root node (default = 0)

xlim

x limits, only works for ‘inward_circular’ layout

additional parameter

some dot arguments:

  • nsplit integer, the number of branch blocks divided when ‘continuous’ is not "none", default is 200.

Value

tree

Author(s)

Yu Guangchuang

References

  1. G Yu, TTY Lam, H Zhu, Y Guan (2018). Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution, 35(2):3041-3043. https://doi.org/10.1093/molbev/msy194

  2. G Yu, DK Smith, H Zhu, Y Guan, TTY Lam (2017). ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution, 8(1):28-36. https://doi.org/10.1111/2041-210X.12628

See Also

geom_tree()

Examples

require(ape) 
tr <- rtree(10)
ggtree(tr)
  # Use the ggtree function to view phylogeny
  ggtree(tree)

# Exercise: Add a (1) treescale using geom_treescale and (2) overlay a title using a ggplot2 functions 
ggtree(tree) + geom_treescale(x=0,y=-10,offset=1) + ggtitle("Initial Phylogeny")

# Question: After adding the scale to the phylogeny, what are your takeaways? Does this tell you anything about the genomic diversity of the isolates used in the tree? 

# Answer: Notice the differential branch lengths in the phylogeny, specifically that there are ~ 400 isolates with a smaller distance between edges. 

Part 3: Clean the Phylogeny by (1) Removing Tips and (2) Performing Midpoint Rooting

# Exercise: Identify the location of the tip labels for this tree
head(tree$tip.label)
[1] "gi_661922017_gb_CP008827.1_" "PCMP_H100"                  
[3] "PCMP_H98"                    "PCMP_H123"                  
[5] "PCMP_H141"                   "PCMP_H376"                  
# Drop Reference group using the treeio package's drop.tip function
tree_wo_root <- treeio::drop.tip(tree,'gi_661922017_gb_CP008827.1_')

# View tree without reference group
ggtree(tree_wo_root)  + ggtitle("Tree Without the Reference Group")

# Question: Would midpoint rooting be valuable for this phylogeny? 
  #Answer: Midpoint root a phylogeny involves locating the longest path between any two tips and putting the root at that location
# Midpoint root using phytools rooting tool
tree_final <- phytools::midpoint.root(tree_wo_root)

# View midpoint rooted tree again
ggtree(tree_final)  + ggtitle("Midpoint Rooted Tree")

# Question: What is the difference in the tree_wo_root and tree_final? 
  # Plot the ggtree images side-by-side using the cowplot package's plot_grid function
  plot_grid(ggtree(tree_wo_root), ggtree(tree_final), ncol=2,labels = c('Without Midpoint Rooting','Midpoint Root'))

Part 4: Explore Whether Subseting the Tree on Sequence Type of Interest is Valuable

# Import CRKP Metadata
CRKP_metadata <- as.data.frame(read_xlsx('./CRKP_metadata.xlsx'))

# Exercise: Characterize the dataset
  #1. What are the variables in this dataframe?
    # Answer: isolate_no, LTACH, Region, Source, ST (Sequence Type), and Colistin (Susceptibility)
  names(CRKP_metadata)
[1] "isolate_no" "LTACH"      "Region"     "Source"     "ST"        
[6] "Colistin"  
  #2. How many isolates are in this dataframe?
    # Hint: Each row = a unique isolate
    # Answer: 458 isolates
  nrow(CRKP_metadata)
[1] 458
  #3. Does the number of isolates in the metadata dataframe equal the number of isolates in the phylogeny?
    # Answer: No. Not all isolates were sequenced and included in the phylogeny. 
  nrow(CRKP_metadata) == tree$Nnode
[1] FALSE
  #4. Does the row names of the dataframe match the tip labels in the tree? 
    # Answer: No. We must ensure that row names of the dataframe match the tip labels of the phylogeny to ensure proper mapping of the data to the phylogeny. 
  head(rownames(CRKP_metadata))
[1] "1" "2" "3" "4" "5" "6"
  head(tree_final$tip.label)
[1] "PCMP_H92"  "PCMP_H366" "PCMP_H100" "PCMP_H98"  "PCMP_H123" "PCMP_H141"
# Create rownames for sorting the dataframe to match the tiplabels of the tree
rownames(CRKP_metadata) <- paste0("PCMP_H",CRKP_metadata$isolate_no)
  
# Subset and sort the dataframe using the match function
CRKP_metadata_final <- as.data.frame(CRKP_metadata[match(as.vector(tree_final$tip.label), row.names(CRKP_metadata)), ])

# Question: What is the distribution of sequence types within the dataset?
  # Data table:
table(CRKP_metadata_final$ST)
107 15 15* 17 1832 193 2237 258 307 34 345 36 418 ND Novel*
1 10 2 2 1 1 1 411 12 1 1 2 1 6 1
  # Bar plot  
ggplot(data = CRKP_metadata_final, aes(x = ST)) +
    geom_bar()

# Add tips to the tree to visualize distribution of sequence types across the phylogeny
  # Create tip for sequence type
  ST_tip <- c(CRKP_metadata_final$ST,rep(NA,tree_final$Nnode))
  # Visualize tree with sequence type as tips 
  ggtree(tree_final) +  geom_tippoint(aes(color=ST_tip,legend_title="Sequence Types"))

  # Question: How is sequence type distributed across the phylogeny?
    # Answer: The clade with a relatively lower distances between tips are predominantly ST258 isolates

Part 5: Subset Tree Based on Sequence Type

# Subset the tree to include only ST258 isolates
  # Create ST258 dataframe
  CRKP_258 <- subset(CRKP_metadata_final,ST == "258")
  # Subset phylogeny to include only ST258 isolates
  tree_258 <- drop.tip(tree_final,tree_final$tip.label[!tree_final$tip.label %in% rownames(CRKP_258)])
  # Sort dataframe by tip label
  CRKP_258_sorted <- CRKP_258[match(tree_258$tip.label, rownames(CRKP_258)), ] 
  
# Visualize ST258 phylogeny
ggtree(tree_258)

# Visualize ST258 phylogeny using different layouts
  # Default (layout = "rectangular")
  ggtree(tree_258)

  # Circular
  ggtree(tree_258,layout = "circular")

  # Slanted
  ggtree(tree_258,layout = "slanted")

  # Dendogran
  ggtree(tree_258,layout = "dendrogram")

  # Ape
  ggtree(tree_258,layout = "ape")

  # No Branch Length
  ggtree(tree_258,layout="rectangular",branch.length = 'none')

# Exercise 1: Create a tip label for Source
  # 1. Tabulate source data
  table(CRKP_258_sorted$Source)
Blood Respiratory Urine Wound
34 231 140 6
  # 2. Create tip for Source
  Source_tip <- c(CRKP_258_sorted$Source,rep(NA,tree_258$Nnode))
  # 3. Visualize
  ggtree(tree_258) +  geom_tippoint(aes(color=Source_tip),size=2) + labs(color="Source")

  # 4. Store as tree_258_source
  tree_258_source <- ggtree(tree_258) +  geom_tippoint(aes(color=Source_tip),size=2) + labs(color="Source")
  
# Exercise 2: Create a tip label for Colistin Resistance
  # 1. How many isolates are susceptible to colistin? 
    # Answer: 275 (66.9%)
  table(CRKP_258_sorted$Colistin)
Non-Susceptible Susceptible
136 275
  # 2. Create tip for colistin resistance
  Colistin <- c(CRKP_258_sorted$Colistin,rep(NA,tree_258$Nnode))
  # 3. Visualize and improve size
  ggtree(tree_258) +  geom_tippoint(aes(color=Colistin,legend_title="Colistin Resistance"),size=2)

# Exercise 3: Create a phylogeny with geom_tippoint (1) color = source and (2) shape = colistin resistance
  ggtree(tree_258) +  geom_tippoint(aes(color=Source_tip,shape=Colistin,legend_title="Region"),size=2)+ labs(color="Source")

Part 6: Visualize colistin Resistance Using gheatmap

# gheatmap Help Page
help(gheatmap)
gheatmap R Documentation

gheatmap

Description

append a heatmap of a matrix to right side of phylogenetic tree

Usage

gheatmap(
  p,
  data,
  offset = 0,
  width = 1,
  low = "green",
  high = "red",
  color = "white",
  colnames = TRUE,
  colnames_position = "bottom",
  colnames_angle = 0,
  colnames_level = NULL,
  colnames_offset_x = 0,
  colnames_offset_y = 0,
  font.size = 4,
  family = "",
  hjust = 0.5,
  legend_title = "value"
)

Arguments

p

tree view

data

matrix or data.frame

offset

offset of heatmap to tree

width

total width of heatmap, compare to width of tree

low

color of lowest value

high

color of highest value

color

color of heatmap cell border

colnames

logical, add matrix colnames or not

colnames_position

one of ‘bottom’ or ‘top’

colnames_angle

angle of column names

colnames_level

levels of colnames

colnames_offset_x

x offset for column names

colnames_offset_y

y offset for column names

font.size

font size of matrix colnames

family

font of matrix colnames

hjust

hjust for column names (0: align left, 0.5: align center, 1: align righ)

legend_title

title of fill legend

Value

tree view

Author(s)

Guangchuang Yu

# Initial gheatmap of Colistin Resistance (without formatting)
gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Colistin))+ ylim(NA, 450) 

# Stepwise formatting of gheatmap
  # Change Heatmap Structure: Angle, Offset, Width, Justification, and Font Size
  gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + ylim(NA, 450)

  # Change Heatmap Fill Color and Legend Name: scale_fill_manual
  gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + ylim(NA, 450)

  # Change Legend Information: Theme 
  gheatmap(ggtree(tree_258), CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8)) + ylim(NA, 475)  

# Exersise: Add stored ST258 phylogeny that has source tips to the gheatmap function created above
  # Hint: Format of gheatmap function = gheatmap(phylogeny, data, options)
gheatmap(tree_258_source, CRKP_258_sorted %>% select(Colistin), colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=5) + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8)) + ylim(NA,475)   

Part 7: Using ggnewscale to plot heatmap with variables that have different data structures

# Exercise: Plot source and resistance (Colistin) in the same gheatmap
gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Source,Colistin))+ ylim(NA, 450)

  # Question: What is 'wrong' about the figure above? How can you fix it? 
  # Answer: Note the issue with the legend. Each variable has different values. Thus, they should have different scales!

#Introducing: ggnewscale's new_scale_fill function!
help(new_scale_fill)
new_scale R Documentation

Adds a new scale to a plot

Description

Creates a new scale "slot". Geoms added to a plot after this function will use a new scale definition.

Usage

new_scale(new_aes)

new_scale_fill()

new_scale_color()

new_scale_colour()

Arguments

new_aes

A string with the name of the aesthetic for which a new scale will be created.

Details

new_scale_color(), new_scale_colour() and new_scale_fill() are just aliases to new_scale(“color”), etc…

Examples

library(ggplot2)

# Equivalent to melt(volcano), but we don't want to depend on reshape2
topography <- expand.grid(x = 1:nrow(volcano),
                          y = 1:ncol(volcano))
topography$z <- c(volcano)

# point measurements of something at a few locations
measurements <- data.frame(x = runif(30, 1, 80),
                           y = runif(30, 1, 60),
                           thing = rnorm(30))

ggplot(mapping = aes(x, y)) +
  geom_contour(data = topography, aes(z = z, color = stat(level))) +
  # Color scale for topography
  scale_color_viridis_c(option = "D") +
  # geoms below will use another color scale
  new_scale_color() +
  geom_point(data = measurements, size = 3, aes(color = thing)) +
  # Color scale applied to geoms added after new_scale_color()
  scale_color_viridis_c(option = "A")

# Generate of Unique Colors
  # Generate Distinct Color Palette Using the distinctColorPallete() function
    # Note: You can also use the randomColor function to generate a vector of random colors. 
  palette_source <- distinctColorPalette(4)
  # View Color Palette Chosen
  pie(rep(1,length(palette_source)),col=palette_source)

# Generate Heatmap Using ggnewscale
  # Phylogeny with Source Heatmap Column
  p1 <- gheatmap(ggtree(tree_258),CRKP_258_sorted %>% select(Source),colnames_position="top", colnames_angle=90, colnames_offset_y=0.25, width=.05,hjust=0,font.size=6) + scale_fill_manual(values=palette_source,name="Source") + ylim(NA,500)
  print(p1)

  #Use new_scale_fill to add optionc 
  p2 <- p1 + new_scale_fill()
  #Add Resistance Column to Heatmap
  p3 <- gheatmap(p2, CRKP_258_sorted %>% select(Colistin), colnames_position="top",  colnames_angle=90, colnames_offset_y=0.25, width=.05, hjust=0, font.size=6, offset=.0000018)  + theme(legend.position ='bottom',legend.key = element_rect(colour=c('black')), legend.direction="horizontal",legend.title = element_text(size=14),legend.text = element_text(size=8))+ geom_treescale(x=0,y=-15,offset=1)   + scale_fill_manual(values=c("black","white"),name="Resistance Profile") + ylim(NA, 500)
  print(p3)